#!/usr/bin/env bash
#
# Nightly job pipeline: crawl -> clean -> import into MySQL.
#
# crontab -e entry (starts around 00:00 every day):
# 0	0	*	* 	* job /home/job/scripts/start_job.sh
#
# Notes:
# - JOB_TABLE points at the test table (job2); change it for production.
# - Raw crawl output goes to data/crawl, cleaned output to data/cleaned,
#   files are named by date.
# - Log output goes to logs/ (configured in start_job.sh).
echo  "************** 开始处理:  $(date "+%Y-%m-%d %H:%M:%S") **************"

#------------- Initialisation -----------------

# Current date, used as the per-run file name (e.g. 2024-01-31).
DATE_NAME=$(date "+%Y-%m-%d")

#---------- MySQL configuration ----------
# MySQL host
HOST=localhost
# MySQL user name
USERNAME=job
# MySQL password
# NOTE(review): a plain-text password in the script and on the command line is
# visible via `ps`; consider sqoop's --password-file instead.
PASSWORD=job
# Target table (job2 is the test table)
JOB_TABLE=job2

#---------- Data directory configuration ----------
# Scrapy crawler project directory
CRAW_HOME=/home/gg/scripts/qccaw
# Root data directory
DATA_DIR=/home/gg/scripts/data

# Raw crawl output directory
CRAW_DATA=${DATA_DIR}/crawl
# Cleaned output directory
CLEAN_DATA=${DATA_DIR}/cleaned

echo "处理文件名:${DATE_NAME}"

##---------- Cleaning configuration ----------
# Minimum number of cities required by the cleaner
CITY_MIN=10
# Location of the cleaning jar
CLEAN_JAR=file:///home/gg/scripts/jobclean.jar

#----------- Hadoop configuration -------
# Sqoop install directory
SQOOP=/opt/sqoop
# Spark install directory
SPARK=/opt/spark

#---------------- Auto-create data directories ----------------
# ensure_dir DIR MSG: create DIR (and any missing parents) if it does not
# exist, printing MSG followed by the directory name when it is created.
ensure_dir() {
  if [ ! -d "$1" ]; then
    echo "$2" "$1"
    mkdir -p "$1"
  fi
}

ensure_dir "${CRAW_DATA}" "爬虫目录文件不存在，自动创建:"
# BUGFIX: the original checked/created ${DATA_DIR} here although the message
# says the *clean* directory is missing; create ${CLEAN_DATA} instead
# (mkdir -p also creates its parent ${DATA_DIR}).
ensure_dir "${CLEAN_DATA}" "清洗目录文件不存在，自动创建:"


#---------------- 1. Crawl ----------------
echo "进入爬虫项目${CRAW_HOME}目录..."
# BUGFIX: abort if the project directory is missing — the original ignored a
# failed cd (and cd'd twice), so scrapy would run from the wrong directory.
cd "${CRAW_HOME}" || { echo "ERROR: cannot cd to ${CRAW_HOME}" >&2; exit 1; }
# Activate the crawler's Python virtualenv.
source venv/bin/activate

echo "************** 开始采集:  $(date "+%Y-%m-%d %H:%M:%S") **************"
# Full crawl over all categories:
#scrapy crawl n1 -a savepath=${CRAW_DATA}/${DATE_NAME}.csv  -a   cate_data=all_cate.json --loglevel=WARN

# Test crawl (small category set):
scrapy crawl n1 -a "savepath=${CRAW_DATA}/${DATE_NAME}.csv" -a cate_data=test_data.json --loglevel=WARN


echo "************** 开始清洗:  $(date "+%Y-%m-%d %H:%M:%S") **************"

#---------------- 2. Clean: <input path> <output path> <min city count> ----------------
# CONSISTENCY: use ${SPARK} (was hard-coded /opt/spark) so the Spark location
# is configured in exactly one place.
"${SPARK}/bin/spark-submit" --class bigdata.jobclean.JobCleaner --master local[2] \
  "${CLEAN_JAR}" "file://${CRAW_DATA}/${DATE_NAME}.csv" "file://${CLEAN_DATA}/${DATE_NAME}/job" "${CITY_MIN}"

echo "************** 清空job表数据:  $(date "+%Y-%m-%d %H:%M:%S") **************"
#---------------- 3. Import into MySQL ----------------
# Truncate the target table before the export so reruns don't duplicate rows.
# CONSISTENCY: use ${HOST} (was hard-coded localhost) in the JDBC URLs.
# NOTE(review): --password on the command line is visible via `ps`;
# consider sqoop's --password-file.
"${SQOOP}/bin/sqoop" eval \
  --connect "jdbc:mysql://${HOST}:3306/job?useUnicode=true&characterEncoding=utf-8" \
  --password "${PASSWORD}" --username "${USERNAME}" \
  -e "truncate table job.${JOB_TABLE}"

echo "************** 导入job数据到MySQL:  $(date "+%Y-%m-%d %H:%M:%S") **************"
"${SQOOP}/bin/sqoop" export --skip-dist-cache \
  --connect "jdbc:mysql://${HOST}:3306/job?useUnicode=true&characterEncoding=utf-8" \
  --password "${PASSWORD}" --username "${USERNAME}" \
  --table "${JOB_TABLE}" \
  --export-dir "file://${CLEAN_DATA}/${DATE_NAME}/job/" \
  --columns "url,name,salary,province,city,exp,edu,num,pubtime,cname,ctype,ctrade,cnum,cate1,cate2,welfare"

echo "************** 清洗工作词云:  $(date "+%Y-%m-%d %H:%M:%S") **************"
# Build the word-cloud data from the cleaned job records.
"${SPARK}/bin/spark-submit" --class bigdata.jobclean.WordCount2 --master local[2] \
  "${CLEAN_JAR}" "file://${CLEAN_DATA}/${DATE_NAME}/job" "file://${CLEAN_DATA}/${DATE_NAME}/cate_wc"

echo "************** 清空词云表:  $(date "+%Y-%m-%d %H:%M:%S") **************"
# Truncate the word-cloud table before the import.
"${SQOOP}/bin/sqoop" eval \
  --connect "jdbc:mysql://${HOST}:3306/job?useUnicode=true&characterEncoding=utf-8" \
  --password "${PASSWORD}" --username "${USERNAME}" \
  -e "truncate table job.cate_wc"

echo "************** 导入词云数据到MySQL:  $(date "+%Y-%m-%d %H:%M:%S") **************"
# exportData writes the word-cloud output into MySQL itself; args are
# <input path> <host> <user> <password>.
"${SPARK}/bin/spark-submit" --class exportData --master local[2] \
  "${CLEAN_JAR}" "file://${CLEAN_DATA}/${DATE_NAME}/cate_wc" "${HOST}" "${USERNAME}" "${PASSWORD}"

echo "************** 全部完成  $(date "+%Y-%m-%d %H:%M:%S") **************"



